scom-org colab

if (TRUE) {
  # load packages
  library("tidyverse")
  library("quarto")
  library("lme4")
  # clear workspace
  # NOTE(review): rm(list = ls()) only removes objects, not attached packages
  # or options; restarting the R session is the reliable way to get a clean state.
  rm(list = ls())
}
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: Matrix


Attaching package: 'Matrix'


The following objects are masked from 'package:tidyr':

    expand, pack, unpack
# working directory
# working directory: hop into qmd/ unless we are already inside it
# NOTE(review): setwd() in scripts is fragile; prefer project-relative paths.
if (!grepl("/qmd", getwd(), fixed = TRUE)) {
  setwd("qmd/")
}
getwd()
[1] "/home/sol-nhl/rnd/r/projects/scom-org/qmd"
# load functions
# load functions
# vals_normalize(): min-max scale y to [0, 1], keeping the pairing with id x.
# Returns a data.frame with columns id and nv (normalized value).
# NOTE(review): when all y are equal the denominator is 0 and nv becomes NaN.
vals_normalize <- function(x, y) {
  rng <- range(y)  # compute min/max once instead of twice
  data.frame(id = x, nv = (y - rng[1]) / (rng[2] - rng[1]))
}
# load fb post dataset
# load fb post dataset; the remote URL is kept as a fallback reference,
# the local copy is what is actually read (the original overwrote fn anyway)
#fn <- "https://raw.githubusercontent.com/nils-holmberg/scom-org/main/csv/fb-sa-230314.csv"
fn <- "../csv/fb-sa-230314.csv"
dtp <- read.table(fn, sep = "\t", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
# re-base the month counter so the observed series starts near 1
dtp <- dtp |> as_tibble() |> mutate(month = month - 30)

# column names of the post-level dataset (referenced by the positional
# select() calls further down)
dtp |> names()
 [1] "id"         "text"       "org"        "time"       "lang"      
 [6] "follow"     "e_lc"       "e_cc"       "e_sc"       "e_rc"      
[11] "e_index"    "emoji"      "emnum"      "sa_val"     "sa_pos"    
[16] "sa_neg"     "sa_frq"     "group"      "date"       "bins"      
[21] "index"      "month"      "sa_int_abs" "sa_int"     "wc"        
[26] "out"        "org_type"  

240110: back to afinn

  • översätta till årtal + månad (t.ex. 2015-01)
  • valens, intensitet, wc, sent/wc, över tid
# monthly means of sentiment and engagement measures per organization type
tmp <- dtp |> as_tibble() |>
  # positional select(c(19,22,27,14,23,25,11,6)) rewritten with column names
  select(date, month, org_type, sa_val, sa_int_abs, wc, e_index, follow) |>
  # sentiment intensity relative to post length
  mutate(sa_int_rel_wc = sa_int_abs / wc) |>
  # engagement index relative to follower count
  mutate(m0_e_index_rel_follow = e_index / follow) |>
  mutate(date = as.Date(date)) |>
  # collapse to year-month labels, e.g. "2015-01"
  mutate(date = format(date, "%Y-%m")) |>
  group_by(date, org_type) |>
  summarize(
    m1_sa_val = mean(sa_val),
    m2_sa_int_abs = mean(sa_int_abs),
    m3_wc = mean(wc),
    m4_sa_int_rel_wc = mean(sa_int_rel_wc),
    m5_e_index = mean(m0_e_index_rel_follow),
    # same grouping as the old default, but silences the summarise() message
    .groups = "drop_last"
  )
`summarise()` has grouped output by 'date'. You can override using the
`.groups` argument.
#|> 
#mutate(date=ifelse(seq_along(date) %% 2 == 1, date, "")) |>
#pivot_longer(cols=3:6, names_to="measure", values_to="score")

# one facet per measure, colored by organization type, linear trend per type
tmp |> pivot_longer(cols = 3:7, names_to = "measure", values_to = "score") |>
  ggplot(aes(x = date, y = score, group = org_type, color = as.factor(org_type))) +
  geom_point(size = 1.5) +
  geom_smooth(method = "lm", linewidth = 1.5, se = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(vars(measure), scales = "free") +
  labs(
    title = "measures by time",
    subtitle = "sentiment measures and engagement",
    # typo fixed: "engagment" -> "engagement"
    caption = "sentiment is increasing over time, engagement is decreasing",
    x = "date of post",
    y = "score",
    color = "organization type"
  ) +
  theme(
    legend.text = element_text(size = 10, face = "bold")
  )
`geom_smooth()` using formula = 'y ~ x'

  • sentiment mått vs engagement index
  • sentiment extremer + exempel poster
# per-month means of sentiment measures plus mean relative engagement
tmp <- dtp |>
  # positional select(c(19,22,27,14,23,25,11,6)) rewritten with column names
  select(date, month, org_type, sa_val, sa_int_abs, wc, e_index, follow) |>
  mutate(sa_int_rel_wc = sa_int_abs / wc) |>
  mutate(m0_e_index_rel_follow = e_index / follow) |>
  group_by(month) |>
  summarize(
    m1_sa_val = mean(sa_val),
    m2_sa_int_abs = mean(sa_int_abs),
    m3_wc = mean(wc),
    m4_sa_int_rel_wc = mean(sa_int_rel_wc),
    m0_e_index = mean(m0_e_index_rel_follow)
  )

# engagement (y) against each sentiment measure (x), one panel per measure
tmp |>
  pivot_longer(cols = 2:5, names_to = "measure", values_to = "score") |>
  ggplot(aes(x = score, y = m0_e_index)) +
  geom_point(size = 1.5) +
  geom_smooth(method = lm, linewidth = 1.5, se = FALSE) +
  facet_wrap(~measure, scales = "free", nrow = 2, ncol = 2) +
  labs(
    title = "engagement by sentiment",
    subtitle = "association between engagement and sentiment",
    caption = "flat or negative trends",
    x = "sentiment",
    y = "engagement"
  ) +
  theme(
    legend.text = element_text(size = 10, face = "bold")
  )
`geom_smooth()` using formula = 'y ~ x'

240104: sentiment, top and bottom

# get sentence tokenized dataframe (one row per sentence, keyed by post id)
fp <- "../tmp/fb-sa-231228-010.tsv"
tmp <- read.table(fp, sep = "\t", quote = "", comment.char = "", header = TRUE,
                  strip.white = TRUE, stringsAsFactors = FALSE)

# Create a single result dataframe with sentiment extremes.
# The join+filter base was duplicated in the original; compute it once.
sv_sentences <- tmp |>
  left_join(dtp |> select(id, lang), by = "id") |>
  filter(lang == "sv")

result_df <- bind_rows(
  # 10 sentences with the highest sentiment scores
  sv_sentences |>
    arrange(desc(sa_numeric), desc(sa_score)) |>
    mutate(category = "top") |>
    head(10),
  # 10 sentences with the lowest sentiment scores
  sv_sentences |>
    arrange(sa_numeric, desc(sa_score)) |>
    mutate(category = "bottom") |>
    head(10)
) |>
  select(id, sentence, sa_label)

#install.packages('simplermarkdown', lib="~/lib/r-cran")
#library(simplermarkdown)
# render the extreme-sentiment table as markdown
result_df |> simplermarkdown::md_table() |> cat()
id sentence sa_label
rfsl.forbundet-10157198447557301 Happy pride! positive
Nordensark-3258235214234799 Thank you all for that! positive
rfsl.forbundet-10157211573442301 Tack grattis och happy pride! positive
kalmarstadsmission-3405318379530319 Det tycker vi e bra!🌏 Var med och bidra till hållbarhet - handla på Kalmar Stadsmission Second Hand - öppet imorgon lördag 10-15! positive
kfumsverige-3915350888535051 Så inspirerande och så viktigt! positive
Lakarmissionen-3129368217161452 Årets viktigaste insamling! positive
uppsalastadsmission-3176615612383966 Handla för andra och stötta en god sak i sommar! positive
raddabarnen-10157641596116794 Årets viktigaste böcker är här! positive
kalmarstadsmission-3193348157394010 Second hand till bästa pris!!! positive
RiksforbundetHjartLung-3554658201222473 Det har gått bra och varit värdefullt. positive
raddningsmissionen-10158140807240345 Fattigdom är ett stort problem världen över cirka 10% av världens befolkning lever i så kallad extrem fattigdom. negative
HRWSweden-1017159588800525 Det kritiska uttalandet leddes av Tysklands FN-ambassadör som bland annat tog upp Kinas polisbrutalitet mot demokratiprotester i Hong Kong och Tibet samt storskalig diskriminering av Kinas muslimska minoritet i provinsen Xinjiang. negative
eskilstunastadsmission-1762474967239976 Kanske tristess. negative
raddabarnen-10157498109326794 Situationen är desperat! negative
AnhorigasRiksforbund-1709548629184479 Där är den mest kritiska posten enligt mig den som visar att 51 procent av de som svarade i undersökningen Inte hade blivit erbjuden stöd och inte heller vetat vart de skulle vända sig. negative
AnhorigasRiksforbund-1769080763231265 Vi vill att användarna är kritiska till saker de tycker inte fungerar är otydligt formulerat saknas eller på andra sätt kan bli bättre! negative
rodakorset-3460295044001098 Stor explosion i Beirut – många döda och skadade. negative
AmnestySverige-10158372520624788 🇺🇸 Det dödliga polisvåldet och misslyckandet i att stoppa den systematiska rasismen i USA kränker svarta amerikaners mänskliga rättigheter inklusive rätten till liv och rätten att inte diskrimineras. negative
AmnestySverige-10158559101704788 Läget är kritiskt och nu KRÄVER VI KRAFTTAG FRÅN UD! negative
rfsu.se-3699723843390967 - Antitrans-allianser och den könsbekräftande vården - Corona - a crisis exploited by populists - Fittfakta deluxe! negative

240103: paper structure

# post-level dataset with transformer aggregates (tab-separated despite .csv)
fp <- "../tmp/fb-sa-240103.csv"
dts <- read.table(fp, sep = "\t", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)

intro

  • RQ1a: how to measure sentiment (transformers)
  • RQ1b: factors that explain sentiment (time, org_type, followers, wordcount)
  • RQ2: effects of sentiment on engagement

methods

  • posts were split into sentences
  • sentences were analyzed with transformers (neg, neu, pos + confidence)
  • sentiment was aggregated per post (DV)
  • predictors were extracted (RQ1)
  • regression analysis with linear mixed-effects models (lme4, random intercept per org_type)

results

# descriptive analysis: mean of each measure per organization type
tmp <- dts |>
  select(sa_numeric_sum, sa_val, time, follow, wc, org_type) |>
  pivot_longer(cols = 1:5, names_to = "name", values_to = "value")
# facet plot, one panel per measure
ggplot(tmp, aes(x = org_type, y = value)) +
  geom_bar(position = "dodge", stat = "summary", fun = "mean") +
  facet_wrap(vars(name), scales = "free")

# same descriptives, split by post language instead of org type
tmp <- dts |>
  select(sa_numeric_sum, sa_val, time, follow, wc, lang) |>
  pivot_longer(cols = 1:5, names_to = "name", values_to = "value")
# facet plot, one panel per measure
ggplot(tmp, aes(x = lang, y = value)) +
  geom_bar(position = "dodge", stat = "summary", fun = "mean") +
  facet_wrap(vars(name), scales = "free")

# inferential analysis: linear mixed model with a random intercept per
# org_type; lme4 warns that time/follow are on much larger scales than the
# other predictors — consider rescaling (e.g. scale()) before fitting
m1 = lmer(sa_numeric_sum ~ time + follow + wc + lang + (1|org_type), data=dts)
Warning: Some predictor variables are on very different scales: consider
rescaling
#m1 = lm(sa_numeric_sum ~ time + follow + wc + lang, data=dts)
#m1 = lmer(sa_val ~ time + follow + wc + lang + (1|org_type), data=dts)
# model summary (lmerMod reports estimates and t values, no p values)
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: sa_numeric_sum ~ time + follow + wc + lang + (1 | org_type)
   Data: dts

REML criterion at convergence: 452753.9

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-16.5404  -0.5866   0.0118   0.5874  22.5713 

Random effects:
 Groups   Name        Variance Std.Dev.
 org_type (Intercept) 0.3036   0.551   
 Residual             6.1671   2.483   
Number of obs: 97188, groups:  org_type, 8

Fixed effects:
              Estimate Std. Error t value
(Intercept)  2.255e+00  3.381e-01   6.671
time        -5.278e-10  1.793e-10  -2.943
follow      -3.457e-06  1.239e-07 -27.909
wc          -4.003e-04  1.632e-04  -2.452
langsv      -1.243e-01  3.271e-02  -3.801

Correlation of Fixed Effects:
       (Intr) time   follow wc    
time   -0.812                     
follow  0.009 -0.022              
wc      0.110 -0.147  0.009       
langsv -0.121  0.046 -0.035 -0.207
fit warnings:
Some predictor variables are on very different scales: consider rescaling

discussion

  • some points here

231229: transformer sentiment

# Load the necessary library
#library(readr)

# List of TSV file paths
#file_paths <- c('path/to/file1.tsv', 'path/to/file2.tsv', 'path/to/file3.tsv') # replace with #actual file paths

# Define the directory containing the TSV files
directory <- "../csv" # replace with the actual directory path

# Pattern for the numbered sentence-level TSV exports
pattern <- "fb-sa-231228-0[0-9]+\\.tsv$" # matches 'fb-sa-231228-001.tsv' ... '-010.tsv' (a 0 followed by one or more digits)

# Create a list of file paths for TSV files matching the pattern
file_paths <- list.files(directory, pattern = pattern, full.names = TRUE)

# Print the list of file paths
print(file_paths)
 [1] "../csv/fb-sa-231228-001.tsv" "../csv/fb-sa-231228-002.tsv"
 [3] "../csv/fb-sa-231228-003.tsv" "../csv/fb-sa-231228-004.tsv"
 [5] "../csv/fb-sa-231228-005.tsv" "../csv/fb-sa-231228-006.tsv"
 [7] "../csv/fb-sa-231228-007.tsv" "../csv/fb-sa-231228-008.tsv"
 [9] "../csv/fb-sa-231228-009.tsv" "../csv/fb-sa-231228-010.tsv"
################################################

# Read one tab-separated file (with a header row) into a dataframe.
# fn: path to the TSV file.
read_tsv_file <- function(fn) {
  read.table(fn, sep = "\t", header = TRUE, strip.white = TRUE,
             stringsAsFactors = FALSE)
}

# Read every chunk file, then row-bind the resulting dataframes into one
list_of_dfs <- lapply(file_paths, read_tsv_file)
combined_df <- do.call(rbind, list_of_dfs)

# Posts can be split across chunk files, producing duplicated ids.

# Separate ids occurring more than once from the unique ones
duplicates <- combined_df %>% group_by(id) %>% filter(n() > 1)
uniques <- combined_df %>% group_by(id) %>% filter(n() == 1) %>% ungroup()

# Collapse each duplicated id to a single row.
# NOTE(review): sa_numeric_sum is re-summed but sa_scaled_sum is averaged —
# if both are per-post sums split across files, sa_scaled_sum should arguably
# also use sum(); confirm the intended aggregation.
aggregated_duplicates <- duplicates %>% group_by(id) %>%
summarize(
sa_numeric_mean=mean(sa_numeric_mean),
sa_numeric_sum=sum(sa_numeric_sum),
sa_scaled_mean=mean(sa_scaled_mean),
sa_scaled_sum=mean(sa_scaled_sum),
sentence_count=max(sentence_count)
) %>%
ungroup()

# Bind the aggregated duplicates back with the uniques
combined_df <- bind_rows(aggregated_duplicates, uniques)

# View the final dataframe
dim(combined_df)
[1] 97188     6
# View the combined dataframe
#paste(dim(dtp), dim(combined_df))

# Find duplicates in the specified column
#dtp[duplicated(dtp$id) | duplicated(dtp$id, fromLast = TRUE), ]
#combined_df[duplicated(combined_df$id) | duplicated(combined_df$id, fromLast = TRUE), ]
#fn = "../csv/fb-sa-231228-001.tsv"
#tmp = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE)

# Find rows in df1 that don't have matching keys in df2
#non_matching_df1 <- anti_join(dtp, combined_df, by = "id")
# Find rows in df2 that don't have matching keys in df1
#non_matching_df2 <- anti_join(combined_df, dtp, by = "id")
# View the results
#print(non_matching_df1)
#print(non_matching_df2)

# join sentence-level aggregates onto post metadata (keeps matching ids only)
dts <- inner_join(combined_df, dtp, by = "id") |> as_tibble()

# column names after the join (referenced by positional select() calls below)
dts |> names()
 [1] "id"              "sa_numeric_mean" "sa_numeric_sum"  "sa_scaled_mean" 
 [5] "sa_scaled_sum"   "sentence_count"  "text"            "org"            
 [9] "time"            "lang"            "follow"          "e_lc"           
[13] "e_cc"            "e_sc"            "e_rc"            "e_index"        
[17] "emoji"           "emnum"           "sa_val"          "sa_pos"         
[21] "sa_neg"          "sa_frq"          "group"           "date"           
[25] "bins"            "index"           "month"           "sa_int_abs"     
[29] "sa_int"          "wc"              "out"             "org_type"       
#write.table(dts, "../tmp/fb-sa-240103.csv", sep="\t", quot=T, row.names=F)

# compare afinn valence with both transformer aggregates per month/org_type
long_df <- dts |>
  # positional select(27,32,19,3,5) rewritten with column names
  select(month, org_type, sa_val, sa_numeric_sum, sa_scaled_sum) |>
  group_by(month, org_type) |>
  summarize(
    afinn = mean(sa_val),
    trans1 = mean(sa_numeric_sum),
    trans2 = mean(sa_scaled_sum),
    # same grouping as the old default, but silences the summarise() message
    .groups = "drop_last"
  ) |>
  pivot_longer(cols = 3:5, names_to = "sa", values_to = "score")
`summarise()` has grouped output by 'month'. You can override using the
`.groups` argument.
# one panel per sentiment measure, colored by organization type
long_df |>
  ggplot(aes(x = month, y = score, group = org_type, color = as.factor(org_type))) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, linewidth = 0.5, se = FALSE) +
  facet_wrap(vars(sa))
`geom_smooth()` using formula = 'y ~ x'

# simple linear baseline: afinn valence on posting time and word count
m1 <- lm(sa_val ~ time + wc, data = dts)
# model summary (coefficients, R-squared)
summary(m1)

Call:
lm(formula = sa_val ~ time + wc, data = dts)

Residuals:
    Min      1Q  Median      3Q     Max 
-95.699  -2.051  -0.762   2.204 142.911 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.240e+00  5.406e-01   7.843 4.44e-15 ***
time        -2.159e-09  3.547e-10  -6.088 1.15e-09 ***
wc           2.293e-02  3.165e-04  72.442  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.951 on 97185 degrees of freedom
Multiple R-squared:  0.05136,   Adjusted R-squared:  0.05134 
F-statistic:  2631 on 2 and 97185 DF,  p-value: < 2.2e-16

sentiment valence over time

# afinn valence, frequency-adjusted valence, and transformer score by month
tmp <- dts |>
  # +0.01 guards against division by zero when sa_frq is 0
  mutate(sa_val_rel = sa_val / (sa_frq + 0.01)) |>
  # positional select(27,19,3,33) rewritten with column names
  select(month, sa_val, sa_numeric_sum, sa_val_rel) |>
  group_by(month) |>
  summarize(afinn = mean(sa_val), afinn_new = mean(sa_val_rel), trans1 = mean(sa_numeric_sum)) |>
  pivot_longer(cols = 2:4, names_to = "sa", values_to = "score")

ggplot(tmp, aes(x = month, y = score, group = sa, color = sa)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, linewidth = 0.5, se = FALSE)
`geom_smooth()` using formula = 'y ~ x'

# mean sentence count and word count per month
# NOTE(review): the original assigned this whole pipeline (ending in a ggplot)
# to tmp, so the plot was never rendered — print it explicitly.
tmp <- dts |>
  # positional select(27,6,30) rewritten with column names
  select(month, sentence_count, wc) |>
  group_by(month) |>
  summarize(sc = mean(sentence_count), wc = mean(wc)) |>
  pivot_longer(cols = 2:3, names_to = "measure", values_to = "score") |>
  ggplot(aes(x = month, y = score, group = measure, color = measure)) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, linewidth = 0.5, se = FALSE)
tmp

231228: summarize results

# monthly means of six sentiment-related measures by organization type
dtp |> as_tibble() |>
  # positional select(c(22,27,13,14,17,23,24,25)) rewritten with column names
  select(month, org_type, emnum, sa_val, sa_frq, sa_int_abs, sa_int, wc) |>
  group_by(month, org_type) |>
  # summarize_at() is superseded; across() with .names keeps "<col>_mean"
  summarize(across(everything(), mean, .names = "{.col}_mean"), .groups = "drop_last") |>
  pivot_longer(cols = 3:8, names_to = "sa", values_to = "score") |>
  ggplot(aes(x = month, y = score, group = org_type, color = as.factor(org_type))) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm, linewidth = 0.5, se = TRUE) +
  facet_wrap(vars(sa), scales = "free")

#  geom_smooth(method="loess", size=2, se=T)
# lme modelling: engagement on afinn valence, random intercept per org_type
m1 = lmer(e_index ~ sa_val + (1|org_type), data=dtp)
# model summary (estimates + t values; lme4 reports no p values)
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: e_index ~ sa_val + (1 | org_type)
   Data: dtp

REML criterion at convergence: 1612004

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
 -0.461  -0.235  -0.115  -0.030 112.746 

Random effects:
 Groups   Name        Variance Std.Dev.
 org_type (Intercept)  17844   133.6   
 Residual             934975   966.9   
Number of obs: 97188, groups:  org_type, 8

Fixed effects:
            Estimate Std. Error t value
(Intercept) 215.7997    47.4378   4.549
sa_val       -1.7923     0.6144  -2.917

Correlation of Fixed Effects:
       (Intr)
sa_val -0.034
# lme modelling: engagement on valence plus follower count and month controls
# (lme4 warns that predictors are on very different scales; consider scale())
m2 = lmer(e_index ~ sa_val + follow + month + (1|org_type), data=dtp)
Warning: Some predictor variables are on very different scales: consider
rescaling
# summary of the extended engagement model
summary(m2)
Linear mixed model fit by REML ['lmerMod']
Formula: e_index ~ sa_val + follow + month + (1 | org_type)
   Data: dtp

REML criterion at convergence: 1607403

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
 -2.177  -0.182  -0.091   0.009 115.298 

Random effects:
 Groups   Name        Variance Std.Dev.
 org_type (Intercept)  11092   105.3   
 Residual             891609   944.3   
Number of obs: 97188, groups:  org_type, 8

Fixed effects:
              Estimate Std. Error t value
(Intercept)  2.269e+02  3.796e+01   5.978
sa_val       1.451e+00  6.019e-01   2.411
follow       3.122e-03  4.720e-05  66.132
month       -3.611e+00  1.773e-01 -20.361

Correlation of Fixed Effects:
       (Intr) sa_val follow
sa_val -0.043              
follow -0.039  0.079       
month  -0.152 -0.011 -0.021
fit warnings:
Some predictor variables are on very different scales: consider rescaling
#dtp |> slice(1000) 
#select(org_type, org) |> 
#group_by(org_type) |> 
#full_join(data.frame(org_type=1:10), by=c("org_type"), keep=T) |>
#summarize(count_distinct=n_distinct(org))

# shared ggplot theme: large bold Helvetica text for all plot elements
t_s = 18
gt = theme(
  strip.text = element_text(size = t_s),
  plot.title = element_text(family = "Helvetica", face = "bold", size = t_s),
  legend.title = element_text(size = t_s, colour = "black", face = "bold.italic", family = "Helvetica"),
  legend.text = element_text(size = t_s, face = "italic", colour = "black", family = "Helvetica"),
  axis.title = element_text(family = "Helvetica", size = t_s, colour = "black", face = "bold"),
  axis.text = element_text(family = "Helvetica", colour = "black", size = t_s)
)

# wide to long format: one row per (post, measure)
tmp <- dtp |>
  # positional select(c(1,27,6,11,14,24)) rewritten with column names
  select(id, org_type, follow, e_index, sa_val, sa_int) |>
  pivot_longer(cols = 3:6, names_to = "measure", values_to = "value")

if (FALSE) {
  # descriptives: N, mean, sd and se of e_index per org_type
  # NOTE(review): replaced plyr::ddply — loading plyr after dplyr masks dplyr
  # verbs (summarise etc.) and is a well-known source of bugs.
  bp1 <- dtp |>
    group_by(org_type) |>
    summarize(N = n(), mean = mean(e_index), sd = sd(e_index), se = sd / sqrt(N))
  bp1

  # facet plot bivariate: mean of each measure per org_type
  p <- ggplot(tmp, aes(x = as.factor(org_type), y = value)) +
    geom_bar(position = "dodge", stat = "summary", fun = "mean") +
    facet_wrap(vars(measure), scales = "free") +
    gt
  p
}

230329: sentiment analysis

# number of distinct organizations within each organization type
dtp |>
  select(org_type, org) |>
  group_by(org_type) |>
  summarize(count_distinct = n_distinct(org))
# A tibble: 8 × 2
  org_type count_distinct
     <int>          <int>
1        1              5
2        2              9
3        3             25
4        4             28
5        5              8
6        6              4
7        7             19
8        9             27
# disabled early-exit guard (flip to TRUE to stop knitting here)
if (FALSE) {
  knitr::knit_exit()
}

230523: linear mixed effects approach

# ivs trend lines: mean afinn valence per org_type and month
tmp <- dtp |> as_tibble() |>
  # positional select(c(27,22,14)) rewritten with column names
  select(org_type, month, sa_val) |>
  group_by(org_type, month) |>
  summarize(sa_val_mean = mean(sa_val), .groups = "drop_last")
`summarise()` has grouped output by 'org_type'. You can override using the
`.groups` argument.
# monthly valence means with a linear trend, one color per org_type
tmp |>
  ggplot(aes(month, sa_val_mean, color = as.factor(org_type))) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(x = "month", y = "sa_val_mean")
`geom_smooth()` using formula = 'y ~ x'

# aggregate by org_type, month:
# join per-org min-max normalized valence (sa_val_norm) back onto dtp, then
# average valence, normalized valence, intensity and followers per cell
tmp = left_join(dtp, dtp |> 
# normalize sa_val within each organization via vals_normalize()
select(c(1,3,22,14)) |> group_by(org) |> group_modify(~ vals_normalize(.x$id, .x$sa_val)) |> rename(sa_val_norm=nv), by=c("id")) |> 
# aggregate; positional select over the joined frame: id, org, org_type, month, sa_val, sa_int, follow, sa_val_norm
select(1,3,27,22,14,24,6,29) |> group_by(org_type, month) |> summarize(sa_val_mean=mean(sa_val), sa_val_norm_mean=mean(sa_val_norm), sa_int_mean=mean(sa_int), follow_mean=mean(follow))
`summarise()` has grouped output by 'org_type'. You can override using the
`.groups` argument.
# ivs, wide to long: one row per (org_type, month, iv)
tmp = tmp |>
  pivot_longer(cols = 3:6, names_to = "ivs", values_to = "score")
# scatter plus linear trend per org_type, one facet per iv
tmp |>
  ggplot(aes(x = month, y = score, group = org_type, color = as.factor(org_type))) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(vars(ivs), scales = "free") +
  labs(title = "engagement trend by org_type and measure", x = "month", y = "score")
`geom_smooth()` using formula = 'y ~ x'

independent variables, ivs

# per-organization min-max normalized versions of the dv and ivs
# dv1: engagement index (positional select(c(1,3,22,11)) rewritten with names)
tmp0 = dtp |>
  select(id, org, month, e_index) |>
  group_by(org) |>
  group_modify(~ vals_normalize(.x$id, .x$e_index)) |>
  rename(e_index_norm = nv)
# iv1: afinn sentiment valence
tmp1 = dtp |>
  select(id, org, month, sa_val) |>
  group_by(org) |>
  group_modify(~ vals_normalize(.x$id, .x$sa_val)) |>
  rename(sa_val_norm = nv)
# iv2: sentiment intensity
tmp2 = dtp |>
  select(id, org, month, sa_int) |>
  group_by(org) |>
  group_modify(~ vals_normalize(.x$id, .x$sa_int)) |>
  rename(sa_int_norm = nv)
# iv3: follower count
tmp3 = dtp |>
  select(id, org, month, follow) |>
  group_by(org) |>
  group_modify(~ vals_normalize(.x$id, .x$follow)) |>
  rename(follow_norm = nv)
# iv4: posting time, normalized globally (not per org)
tmp4 = vals_normalize(dtp$id, dtp$time) |> as_tibble() |> rename(time_norm = nv)
# combine dv and ivs by post id; the repeated org columns from the joins get
# .x/.y suffixes, hence the final rename(org = org.x)
tmp = tmp0 |>
  left_join(dtp |> select(id, month, org_type, out), by = c("id")) |>
  left_join(tmp1, by = c("id")) |>
  left_join(tmp2, by = c("id")) |>
  left_join(tmp3, by = c("id")) |>
  left_join(tmp4, by = c("id")) |>
  rename(org = org.x)
#
# lme modelling: normalized engagement on normalized ivs with a random
# intercept per organization; posts flagged as outliers (out == 1) excluded
m1 = lmer(data=tmp, subset=out!=1, e_index_norm ~ sa_val_norm + sa_int_norm + follow_norm + time_norm + (1|org))
summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: e_index_norm ~ sa_val_norm + sa_int_norm + follow_norm + time_norm +  
    (1 | org)
   Data: tmp
 Subset: out != 1

REML criterion at convergence: -150807.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5139 -0.3762 -0.1564  0.0400 12.1850 

Random effects:
 Groups   Name        Variance Std.Dev.
 org      (Intercept) 0.001807 0.0425  
 Residual             0.006609 0.0813  
Number of obs: 69434, groups:  org, 125

Fixed effects:
             Estimate Std. Error t value
(Intercept)  0.055664   0.004127  13.487
sa_val_norm  0.010318   0.003454   2.987
sa_int_norm  0.028773   0.004347   6.619
follow_norm  0.004400   0.002241   1.963
time_norm   -0.010522   0.002407  -4.371

Correlation of Fixed Effects:
            (Intr) s_vl_n s_nt_n fllw_n
sa_val_norm -0.341                     
sa_int_norm  0.167 -0.553              
follow_norm -0.031 -0.007  0.007       
time_norm   -0.046  0.013 -0.028 -0.887
#m2 = lmer(data=tmp, e_index_norm ~ sa_val_norm + sa_int_norm + follow_norm + (1|org_type) + (1|month))
#m2 = lmer(data=tmp, e_index_norm ~ sa_val_norm + sa_int_norm + follow_norm + (1|month))
#summary(m2)

230418: growth analysis (cont.)

# load functions from the project-local helper script
source("socm_sa_functions.R")
# aggregate engagement measures by organization
# (dvs_org_aggregate is defined in socm_sa_functions.R)
tmp = dvs_org_aggregate(dtp)
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
# get org_type by aligning row order within each org
# NOTE(review): joining on a synthetic row_number is fragile — it assumes both
# frames enumerate each org's rows in the same order.
tmp = tmp |> group_by(org) |> mutate(id = row_number()) |>
  left_join(dtp |> select(org, org_type) |> group_by(org) |> mutate(id = row_number()), by = c("org", "id"))
# aggregate the four engagement variants by org_type and month
tmp = tmp |> group_by(org_type, month) |>
  summarize(
    e_index_mean = mean(e_index_mean),
    e_index_rel_mean = mean(e_index_rel_mean),
    e_index_norm_mean = mean(e_index_norm_mean),
    e_index_mean_norm = mean(e_index_mean_norm),
    # same grouping as the old default, but silences the summarise() message
    .groups = "drop_last"
  )
`summarise()` has grouped output by 'org_type'. You can override using the
`.groups` argument.
# dvs, wide to long: one row per (org_type, month, dv)
tmp = tmp |>
  pivot_longer(cols = 3:6, names_to = "dvs", values_to = "e_index")
# monthly points with a linear trend per org_type, one facet per dv
tmp |>
  ggplot(aes(x = month, y = e_index, group = org_type, color = as.factor(org_type))) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  facet_wrap(vars(dvs), scales = "free") +
  theme_bw() +
  labs(title = "engagement trend by org_type and measure", x = "month", y = "e_index")
`geom_smooth()` using formula = 'y ~ x'

heatmap

# heatmap of e_index by month and org_type, one panel per dv measure
# (the four copy-pasted plot definitions collapsed into a single lapply)
dv_names <- c("e_index_mean", "e_index_mean_norm", "e_index_norm_mean", "e_index_rel_mean")
heatmaps <- lapply(dv_names, function(d) {
  tmp |> filter(dvs == d) |>
    ggplot(aes(x = month, y = as.factor(org_type))) +
    geom_raster(aes(fill = e_index))
})
hm = gridExtra::grid.arrange(grobs = heatmaps, ncol = 2, nrow = 2)

#
hm
TableGrob (2 x 2) "arrange": 4 grobs
  z     cells    name           grob
1 1 (1-1,1-1) arrange gtable[layout]
2 2 (1-1,2-2) arrange gtable[layout]
3 3 (2-2,1-1) arrange gtable[layout]
4 4 (2-2,2-2) arrange gtable[layout]
# long format, 8 org_type * 60 months: mean engagement index per cell
tmp <- dtp |> as_tibble() |>
  # positional select(c(27,22,11)) rewritten with column names
  select(org_type, month, e_index) |>
  group_by(org_type, month) |>
  summarize(e_index_mean = mean(e_index), .groups = "drop_last")
`summarise()` has grouped output by 'org_type'. You can override using the
`.groups` argument.
# monthly engagement means with a linear trend per org_type
tmp |>
  ggplot(aes(month, e_index_mean, color = as.factor(org_type))) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  theme_bw() +
  labs(x = "month", y = "e_index_mean")
`geom_smooth()` using formula = 'y ~ x'

230322: growth analysis

# engagement over time: quick peek at the relevant columns
tmp = dtp |> select(c(1, 27, 22, 11)) |> head()
#facet plot bivariate
#p = ggplot(tmp, aes(x=org_type, y=value)) +
#  geom_bar(position='dodge', stat='summary', fun='mean') +
#  facet_wrap(vars(measure), scales="free")
#p
# dv1: mean raw engagement index per org and month
# (positional select(c(3,22,11)) rewritten with column names)
tmp1 = dtp |>
  select(org, month, e_index) |>
  group_by(org, month) |>
  summarize(e_index_mean = mean(e_index), .groups = "drop_last")
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
# dv2: engagement relative to follower count, averaged per org and month
tmp2 = dtp |>
  mutate(e_index_rel = e_index / follow) |>
  # positional select(c(3,22,28)) rewritten with column names
  select(org, month, e_index_rel) |>
  group_by(org, month) |>
  summarize(e_index_rel_mean = mean(e_index_rel), .groups = "drop_last")
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
# dv3: per-post e_index min-max normalized within each org (vals_normalize),
# joined back by (org, id), then averaged per org and month
tmp3 = left_join(dtp %>% select(c(1,3,22,11)), dtp %>% select(c(1,3,22,11)) %>% 
group_by(org) %>% group_modify(~ vals_normalize(.x$id,.x$e_index)), by=c("org","id")) %>% 
select(c(2,3,5)) %>% rename(e_index_norm=nv) %>% 
group_by(org, month) %>% summarize(e_index_norm_mean=mean(e_index_norm))
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
# dv4: the monthly means from dv1 (tmp1), min-max normalized within each org;
# vals_normalize receives month in its "id" slot, hence rename(month=id)
tmp4 = left_join(tmp1, tmp1 %>% 
group_by(org) %>% group_modify(~ vals_normalize(.x$month,.x$e_index_mean)) %>% 
rename(month=id, e_index_mean_norm=nv), by=c("org","month")) %>% select(-c(3))
# dvs, join multiple: combine the four dv variants by org and month
tmp = tmp1 |>
  left_join(tmp2, by = c("org", "month")) |>
  left_join(tmp3, by = c("org", "month")) |>
  left_join(tmp4, by = c("org", "month"))
# dvs, wide to long: one row per (org, month, dv)
tmp = tmp |>
  pivot_longer(cols = 3:6, names_to = "dvs", values_to = "e_index")

engagement index, dependent variable

  1. e_index_mean: per post e_index average per month (“raw engagement scores”)
  2. e_index_mean_norm: normalized per organization values of e_index_mean (1. above)
  3. e_index_norm_mean: per post e_index normalized per organization average per month
  4. e_index_rel_mean: per post e_index relative to follower count average per month
# line plot of each dv measure for two example organizations
lp = ggplot(
  data = tmp |> filter(org %in% c("ClownerutanGranser", "lakareutangranser")),
  aes(x = month, y = e_index, group = org, color = org)) +
  geom_line(linewidth = 1) +
  facet_wrap(vars(dvs), scales = "free") +
  labs(title = "engagement by org and measure")
lp

#  facet_wrap(~dvs)
#  facet_grid(. ~ dvs)

long to wide format

# long to wide: one row per org, one column per month (month_31 ... month_90)
dtw <- dtp |>
  # positional select(c(3,22,11)) rewritten with column names
  select(org, month, e_index) |>
  group_by(org, month) |>
  summarize(e_index_mean = mean(e_index), .groups = "drop_last") |>
  pivot_wider(names_from = month, names_glue = "month_{month}", values_from = e_index_mean)
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
# latent growth model, sem approach
library(lavaan)
This is lavaan 0.6-15
lavaan is FREE software! Please report any bugs.
#dtw %>% names() %>% sort() %>% paste0("1*",.," +", collapse=' ')
#cnt=0;for (i in dtw %>% names() %>% sort()){cat(paste0(cnt,"*",i," + "));cnt=cnt+1;}
model <- '
# intercept
i =~ 
1*month_31 + 1*month_32 + 1*month_33 + 1*month_34 + 1*month_35 + 1*month_36 + 1*month_37 + 1*month_38 + 1*month_39 + 1*month_40 + 1*month_41 + 1*month_42 + 1*month_43 + 1*month_44 + 1*month_45 + 1*month_46 + 1*month_47 + 1*month_48 + 1*month_49 + 1*month_50 + 1*month_51 + 1*month_52 + 1*month_53 + 1*month_54 + 1*month_55 + 1*month_56 + 1*month_57 + 1*month_58 + 1*month_59 + 1*month_60 + 1*month_61 + 1*month_62 + 1*month_63 + 1*month_64 + 1*month_65 + 1*month_66 + 1*month_67 + 1*month_68 + 1*month_69 + 1*month_70 + 1*month_71 + 1*month_72 + 1*month_73 + 1*month_74 + 1*month_75 + 1*month_76 + 1*month_77 + 1*month_78 + 1*month_79 + 1*month_80 + 1*month_81 + 1*month_82 + 1*month_83 + 1*month_84 + 1*month_85 + 1*month_86 + 1*month_87 + 1*month_88 + 1*month_89 + 1*month_90 
# slope
s =~ 
0*month_31 + 1*month_32 + 2*month_33 + 3*month_34 + 4*month_35 + 5*month_36 + 6*month_37 + 7*month_38 + 8*month_39 + 9*month_40 + 10*month_41 + 11*month_42 + 12*month_43 + 13*month_44 + 14*month_45 + 15*month_46 + 16*month_47 + 17*month_48 + 18*month_49 + 19*month_50 + 20*month_51 + 21*month_52 + 22*month_53 + 23*month_54 + 24*month_55 + 25*month_56 + 26*month_57 + 27*month_58 + 28*month_59 + 29*month_60 + 30*month_61 + 31*month_62 + 32*month_63 + 33*month_64 + 34*month_65 + 35*month_66 + 36*month_67 + 37*month_68 + 38*month_69 + 39*month_70 + 40*month_71 + 41*month_72 + 42*month_73 + 43*month_74 + 44*month_75 + 45*month_76 + 46*month_77 + 47*month_78 + 48*month_79 + 49*month_80 + 50*month_81 + 51*month_82 + 52*month_83 + 53*month_84 + 54*month_85 + 55*month_86 + 56*month_87 + 57*month_88 + 58*month_89 + 59*month_90
'
#
#fit1 <- growth(model, data=dtw)
#summary(fit1, standardized=TRUE)
# visualize path diagram
#install.packages('tidySEM', lib="~/lib/r-cran", dependencies=TRUE)
library(tidySEM)
Loading required package: OpenMx
To take full advantage of multiple cores, use:
  mxOption(key='Number of Threads', value=parallel::detectCores()) #now
  Sys.setenv(OMP_NUM_THREADS=parallel::detectCores()) #before library(OpenMx)

Attaching package: 'OpenMx'

The following objects are masked from 'package:Matrix':

    %&%, expm
#graph_sem(model=fit1)
#
#library(lavaanPlot)
#lavaanPlot(model=fit1)
#
#library("DiagrammeR")
#library("semPlot")
#semPaths(fit1, intercept = FALSE, whatLabel = "est", residuals = FALSE, exoCov = FALSE)

wide to long format

# wide back to long: recover the numeric month from the month_XX column names
# (superseded mutate_at() replaced by a plain mutate with as.numeric)
dtl <- dtw |>
  pivot_longer(cols = 2:61, names_to = "month", values_to = "e_index_mean") |>
  mutate(month = as.numeric(str_extract(month, "[0-9]+$")))
# line plot: raw monthly engagement means for two example organizations
lp = ggplot(
  data = dtl |> filter(org %in% c("ClownerutanGranser", "lakareutangranser")),
  aes(x = month, y = e_index_mean, group = org, color = org)) +
  geom_line()
lp

# same two organizations, now with points plus a fitted linear trend
dtl |>
  filter(org %in% c("ClownerutanGranser", "lakareutangranser")) |>
  ggplot(aes(month, e_index_mean, color = org)) +
  geom_point() +                         # monthly observations
  geom_smooth(method = lm, se = FALSE) + # linear trend line
  theme_bw() +
  labs(x = "month", y = "e_index_mean")
`geom_smooth()` using formula = 'y ~ x'

230315: bivariate org type

# wide to long format, then mean of each measure per org_type
tmp <- dtp |>
  # positional select(c(1,27,6,11,14,24)) rewritten with column names
  select(id, org_type, follow, e_index, sa_val, sa_int) |>
  pivot_longer(cols = 3:6, names_to = "measure", values_to = "value")
# facet plot bivariate: one bar panel per measure
p <- ggplot(tmp, aes(x = org_type, y = value)) +
  geom_bar(position = "dodge", stat = "summary", fun = "mean") +
  facet_wrap(vars(measure), scales = "free")
p

230314: resume analysis

# stop knitting here: everything below this point is excluded from the render
if (TRUE) {
  knitr::knit_exit()
}